/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

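/*!
 * \file kernel_simt_atomic_impl.h
 * \brief Implementations of the SIMT atomic operations (CAS, add, sub, exchange, max, min, inc, dec,
 *        and, or, xor), with one variant for ASCENDC_CPU_DEBUG builds and one for device builds.
 */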
#ifndef ASCENDC_MODULE_SIMT_ATOMIC_IMPL_H
#define ASCENDC_MODULE_SIMT_ATOMIC_IMPL_H

#if defined(ASCENDC_CPU_DEBUG)
#include "kernel_process_lock.h"
#include "kernel_utils.h"
#include "kernel_simt_cpu.h"
#include "stub_def.h"
#endif

namespace AscendC {
namespace Simt {
#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug compare-and-swap: writes val to *address only when *address equals compare.
 * \param address location to inspect and possibly overwrite
 * \param compare value *address is compared against
 * \param val     value stored when the comparison succeeds
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicCasImpl(__gm__ T *address, T compare, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto swapIfEqual = [address, compare, val]() {
        if (!(*address == compare)) {
            return;
        }
        *address = val;
    };
    ThreadBlock::GetBlockInstance().AtomicOp(swapIfEqual);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Compare-and-swap on a __ubuf__ address; forwards to bisheng::cce::simt::atomicCAS.
 * \return whatever the intrinsic returns for (address, compare, val)
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicCasImpl(__ubuf__ T *address, T compare, T val)
{
    T previous = bisheng::cce::simt::atomicCAS(address, compare, val);
    return previous;
}

/*!
 * \brief Compare-and-swap on a __gm__ address; forwards to bisheng::cce::simt::atomicCAS.
 * \return whatever the intrinsic returns for (address, compare, val)
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicCasImpl(__gm__ T *address, T compare, T val)
{
    T previous = bisheng::cce::simt::atomicCAS(address, compare, val);
    return previous;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug add: adds val to *address through the thread block's AtomicOp.
 * \param address location to update
 * \param val     addend
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicAddImpl(__gm__ T *address, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto accumulate = [address, val]() {
        *address += val;
    };
    ThreadBlock::GetBlockInstance().AtomicOp(accumulate);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Atomically adds val to *address (__ubuf__ overload).
 * \param address location to update
 * \param val     addend
 * \return the result of bisheng::cce::simt::atomicAdd
 *
 * The intrinsic's result is returned directly instead of re-reading *address afterwards: a plain
 * load issued after the atomic is not part of the atomic operation, so it may observe updates made
 * by other threads, and it does not match the CPU-debug implementation, which returns the value
 * read before the update.
 * NOTE(review): assumes atomicAdd returns the pre-update value, in the same way the results of
 * atomicCAS/atomicExch are returned elsewhere in this file - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicAddImpl(__ubuf__ T *address, T val)
{
    return bisheng::cce::simt::atomicAdd(address, val);
}

/*!
 * \brief Atomically adds val to *address (__gm__ overload).
 * \param address location to update
 * \param val     addend
 * \return the result of bisheng::cce::simt::atomicAdd (see the note on the __ubuf__ overload's
 *         rationale: no separate, non-atomic re-read of *address is performed)
 *
 * NOTE(review): assumes atomicAdd returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicAddImpl(__gm__ T *address, T val)
{
    return bisheng::cce::simt::atomicAdd(address, val);
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug subtract, expressed as an add of (0 - val).
 * \param address location to update
 * \param val     amount to subtract
 * \return the value returned by AtomicAddImpl for the negated operand
 */
template <typename T>
T AtomicSubImpl(__gm__ T *address, T val)
{
    // Negate by subtracting from a zero of type T, then reuse the add path.
    auto negatedVal = static_cast<T>(0) - val;
    return AtomicAddImpl(address, negatedVal);
}
#else
/*!
 * \brief Atomically subtracts val from *address (__ubuf__ overload) by atomically adding -val.
 * \param address location to update
 * \param val     amount to subtract
 * \return the result of bisheng::cce::simt::atomicAdd(address, -val)
 *
 * The intrinsic's result is returned directly instead of re-reading *address afterwards: a plain
 * load issued after the atomic is not part of the atomic operation, so it may observe updates made
 * by other threads, and it does not match the CPU-debug implementation, which returns the value
 * read before the update.
 * NOTE(review): assumes atomicAdd returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicSubImpl(__ubuf__ T *address, T val)
{
    return bisheng::cce::simt::atomicAdd(address, -val);
}

/*!
 * \brief Atomically subtracts val from *address (__gm__ overload) by atomically adding -val.
 * \param address location to update
 * \param val     amount to subtract
 * \return the result of bisheng::cce::simt::atomicAdd(address, -val); no separate, non-atomic
 *         re-read of *address is performed
 *
 * NOTE(review): assumes atomicAdd returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicSubImpl(__gm__ T *address, T val)
{
    return bisheng::cce::simt::atomicAdd(address, -val);
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug exchange: overwrites *address with val through the thread block's AtomicOp.
 * \param address location to overwrite
 * \param val     value to store
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicExchImpl(__gm__ T *address, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto overwrite = [address, val]() {
        *address = val;
    };
    ThreadBlock::GetBlockInstance().AtomicOp(overwrite);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Exchange on a __ubuf__ address; forwards to bisheng::cce::simt::atomicExch.
 * \return whatever the intrinsic returns for (address, val)
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicExchImpl(__ubuf__ T *address, T val)
{
    T previous = bisheng::cce::simt::atomicExch(address, val);
    return previous;
}

/*!
 * \brief Exchange on a __gm__ address; forwards to bisheng::cce::simt::atomicExch.
 * \return whatever the intrinsic returns for (address, val)
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicExchImpl(__gm__ T *address, T val)
{
    T previous = bisheng::cce::simt::atomicExch(address, val);
    return previous;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug max: stores val to *address when *address is less than val.
 * \param address location to update
 * \param val     candidate maximum
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicMaxImpl(__gm__ T *address, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto raiseToVal = [address, val]() {
        if (!(*address < val)) {
            return;  // current value is not smaller: leave it untouched
        }
        *address = val;
    };
    ThreadBlock::GetBlockInstance().AtomicOp(raiseToVal);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Atomic max on a __ubuf__ address via bisheng::cce::simt::atomicMax.
 * \param address location to update
 * \param val     candidate maximum
 * \return the result of bisheng::cce::simt::atomicMax
 *
 * The intrinsic's result is returned directly instead of re-reading *address afterwards: a plain
 * load issued after the atomic is not part of the atomic operation, so it may observe updates made
 * by other threads, and it does not match the CPU-debug implementation, which returns the value
 * read before the update.
 * NOTE(review): assumes atomicMax returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicMaxImpl(__ubuf__ T *address, T val)
{
    return bisheng::cce::simt::atomicMax(address, val);
}

/*!
 * \brief Atomic max on a __gm__ address via bisheng::cce::simt::atomicMax.
 * \param address location to update
 * \param val     candidate maximum
 * \return the result of bisheng::cce::simt::atomicMax; no separate, non-atomic re-read of
 *         *address is performed
 *
 * NOTE(review): assumes atomicMax returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicMaxImpl(__gm__ T *address, T val)
{
    return bisheng::cce::simt::atomicMax(address, val);
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug min: stores val to *address when *address is greater than val.
 * \param address location to update
 * \param val     candidate minimum
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicMinImpl(__gm__ T *address, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto lowerToVal = [address, val]() {
        if (!(*address > val)) {
            return;  // current value is not larger: leave it untouched
        }
        *address = val;
    };
    ThreadBlock::GetBlockInstance().AtomicOp(lowerToVal);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Atomic min on a __ubuf__ address via bisheng::cce::simt::atomicMin.
 * \param address location to update
 * \param val     candidate minimum
 * \return the result of bisheng::cce::simt::atomicMin
 *
 * The intrinsic's result is returned directly instead of re-reading *address afterwards: a plain
 * load issued after the atomic is not part of the atomic operation, so it may observe updates made
 * by other threads, and it does not match the CPU-debug implementation, which returns the value
 * read before the update.
 * NOTE(review): assumes atomicMin returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicMinImpl(__ubuf__ T *address, T val)
{
    return bisheng::cce::simt::atomicMin(address, val);
}

/*!
 * \brief Atomic min on a __gm__ address via bisheng::cce::simt::atomicMin.
 * \param address location to update
 * \param val     candidate minimum
 * \return the result of bisheng::cce::simt::atomicMin; no separate, non-atomic re-read of
 *         *address is performed
 *
 * NOTE(review): assumes atomicMin returns the pre-update value - confirm against its declaration.
 */
template <typename T>
__simt_callee__ __aicore__ inline T AtomicMinImpl(__gm__ T *address, T val)
{
    return bisheng::cce::simt::atomicMin(address, val);
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug wrapping increment: resets *address to 0 when it is >= val, otherwise adds 1.
 * \param address location to update
 * \param val     bound at (or above) which the stored value wraps to 0
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicIncImpl(__gm__ T *address, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto wrapIncrement = [address, val]() {
        if (*address >= val) {
            *address = static_cast<T>(0);
            return;
        }
        *address += static_cast<T>(1);
    };
    ThreadBlock::GetBlockInstance().AtomicOp(wrapIncrement);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Wrapping increment built on a compare-and-swap retry loop.
 * \tparam DstType value type used for the computation and the result
 * \tparam SrcType pointee type of address (the wrappers pass the address-space-qualified type)
 * \param address location to update
 * \param val     bound at (or above) which the stored value wraps to 0
 * \return the value AtomicCasImpl reported on the attempt that matched the expected value
 */
template <typename DstType, typename SrcType>
__aicore__ inline DstType AtomicIncImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    bool retry = true;
    while (retry) {
        const DstType expected = observed;
        const DstType desired = (expected >= val) ? static_cast<DstType>(0)
                                                  : static_cast<DstType>(expected + static_cast<DstType>(1));
        observed = AtomicCasImpl(address, expected, desired);
        // A mismatch means the stored value changed since it was observed; recompute and try again.
        retry = (expected != observed);
    }
    return observed;
}

/*! \brief Wrapping increment on a __ubuf__ address; delegates to AtomicIncImpl_. */
template <typename T>
__aicore__ inline T AtomicIncImpl(__ubuf__ T *address, T val)
{
    T previous = AtomicIncImpl_<T, __ubuf__ T>(address, val);
    return previous;
}

/*! \brief Wrapping increment on a __gm__ address; delegates to AtomicIncImpl_. */
template <typename T>
__aicore__ inline T AtomicIncImpl(__gm__ T *address, T val)
{
    T previous = AtomicIncImpl_<T, __gm__ T>(address, val);
    return previous;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug wrapping decrement: stores val when *address is 0 or greater than val,
 *        otherwise subtracts 1.
 * \param address location to update
 * \param val     value the stored value is reset to when it is 0 or above val
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicDecImpl(__gm__ T *address, T val)
{
    // The snapshot and the update are bracketed by ProcessLock Write()/Unlock().
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    auto wrapDecrement = [address, val]() {
        const bool reload = (*address == static_cast<T>(0)) || (*address > val);
        if (reload) {
            *address = val;
            return;
        }
        *address -= static_cast<T>(1);
    };
    ThreadBlock::GetBlockInstance().AtomicOp(wrapDecrement);
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Wrapping decrement built on a compare-and-swap retry loop.
 * \tparam DstType value type used for the computation and the result
 * \tparam SrcType pointee type of address (the wrappers pass the address-space-qualified type)
 * \param address location to update
 * \param val     value stored when the current value is 0 or greater than val
 * \return the value AtomicCasImpl reported on the attempt that matched the expected value
 */
template <typename DstType, typename SrcType>
__aicore__ inline DstType AtomicDecImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    bool retry = true;
    while (retry) {
        const DstType expected = observed;
        const bool reload = (expected == static_cast<DstType>(0)) || (expected > val);
        const DstType desired = reload ? val : static_cast<DstType>(expected - static_cast<DstType>(1));
        observed = AtomicCasImpl(address, expected, desired);
        // A mismatch means the stored value changed since it was observed; recompute and try again.
        retry = (expected != observed);
    }
    return observed;
}

/*! \brief Wrapping decrement on a __ubuf__ address; delegates to AtomicDecImpl_. */
template <typename T>
__aicore__ inline T AtomicDecImpl(__ubuf__ T *address, T val)
{
    T previous = AtomicDecImpl_<T, __ubuf__ T>(address, val);
    return previous;
}

/*! \brief Wrapping decrement on a __gm__ address; delegates to AtomicDecImpl_. */
template <typename T>
__aicore__ inline T AtomicDecImpl(__gm__ T *address, T val)
{
    T previous = AtomicDecImpl_<T, __gm__ T>(address, val);
    return previous;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug bitwise AND: replaces *address with (*address & val) through AtomicOp.
 * \param address location to update
 * \param val     mask to AND into the stored value
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicAndImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T ret = *address;
    // Compute from the value present when the callback runs (as the other CPU-debug atomics do)
    // rather than from the earlier snapshot, so the update cannot be based on a stale value.
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() { *address = *address & val; });
    ProcessLock::GetProcessLock()->Unlock();
    return ret;
}
#else
/*!
 * \brief Bitwise AND built on a compare-and-swap retry loop.
 * \tparam DstType value type used for the computation and the result
 * \tparam SrcType pointee type of address (the wrappers pass the address-space-qualified type)
 * \param address location to update
 * \param val     mask to AND into the stored value
 * \return the value AtomicCasImpl reported on the attempt that matched the expected value
 */
template <typename DstType, typename SrcType>
__aicore__ inline DstType AtomicAndImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    bool retry = true;
    while (retry) {
        const DstType expected = observed;
        const DstType desired = static_cast<DstType>(expected & val);
        observed = AtomicCasImpl(address, expected, desired);
        // A mismatch means the stored value changed since it was observed; recompute and try again.
        retry = (expected != observed);
    }
    return observed;
}

/*! \brief Bitwise AND on a __ubuf__ address; delegates to AtomicAndImpl_. */
template <typename T>
__aicore__ inline T AtomicAndImpl(__ubuf__ T *address, T val)
{
    T previous = AtomicAndImpl_<T, __ubuf__ T>(address, val);
    return previous;
}

/*! \brief Bitwise AND on a __gm__ address; delegates to AtomicAndImpl_. */
template <typename T>
__aicore__ inline T AtomicAndImpl(__gm__ T *address, T val)
{
    T previous = AtomicAndImpl_<T, __gm__ T>(address, val);
    return previous;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug bitwise OR: replaces *address with (*address | val) through AtomicOp.
 * \param address location to update
 * \param val     bits to OR into the stored value
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicOrImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T ret = *address;
    // Compute from the value present when the callback runs (as the other CPU-debug atomics do)
    // rather than from the earlier snapshot, so the update cannot be based on a stale value.
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() { *address = *address | val; });
    ProcessLock::GetProcessLock()->Unlock();
    return ret;
}
#else
/*!
 * \brief Bitwise OR built on a compare-and-swap retry loop.
 * \tparam DstType value type used for the computation and the result
 * \tparam SrcType pointee type of address (the wrappers pass the address-space-qualified type)
 * \param address location to update
 * \param val     bits to OR into the stored value
 * \return the value AtomicCasImpl reported on the attempt that matched the expected value
 */
template <typename DstType, typename SrcType>
__aicore__ inline DstType AtomicOrImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    bool retry = true;
    while (retry) {
        const DstType expected = observed;
        const DstType desired = static_cast<DstType>(expected | val);
        observed = AtomicCasImpl(address, expected, desired);
        // A mismatch means the stored value changed since it was observed; recompute and try again.
        retry = (expected != observed);
    }
    return observed;
}

/*! \brief Bitwise OR on a __ubuf__ address; delegates to AtomicOrImpl_. */
template <typename T>
__aicore__ inline T AtomicOrImpl(__ubuf__ T *address, T val)
{
    T previous = AtomicOrImpl_<T, __ubuf__ T>(address, val);
    return previous;
}

/*! \brief Bitwise OR on a __gm__ address; delegates to AtomicOrImpl_. */
template <typename T>
__aicore__ inline T AtomicOrImpl(__gm__ T *address, T val)
{
    T previous = AtomicOrImpl_<T, __gm__ T>(address, val);
    return previous;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug bitwise XOR: replaces *address with (*address ^ val) through AtomicOp.
 * \param address location to update
 * \param val     bits to XOR into the stored value
 * \return the value read from *address before the update callback is handed to AtomicOp
 */
template <typename T>
T AtomicXorImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T ret = *address;
    // Compute from the value present when the callback runs (as the other CPU-debug atomics do)
    // rather than from the earlier snapshot, so the update cannot be based on a stale value.
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() { *address = *address ^ val; });
    ProcessLock::GetProcessLock()->Unlock();
    return ret;
}
#else
/*!
 * \brief Bitwise XOR built on a compare-and-swap retry loop.
 * \tparam DstType value type used for the computation and the result
 * \tparam SrcType pointee type of address (the wrappers pass the address-space-qualified type)
 * \param address location to update
 * \param val     bits to XOR into the stored value
 * \return the value AtomicCasImpl reported on the attempt that matched the expected value
 */
template <typename DstType, typename SrcType>
__aicore__ inline DstType AtomicXorImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    bool retry = true;
    while (retry) {
        const DstType expected = observed;
        const DstType desired = static_cast<DstType>(expected ^ val);
        observed = AtomicCasImpl(address, expected, desired);
        // A mismatch means the stored value changed since it was observed; recompute and try again.
        retry = (expected != observed);
    }
    return observed;
}

/*! \brief Bitwise XOR on a __ubuf__ address; delegates to AtomicXorImpl_. */
template <typename T>
__aicore__ inline T AtomicXorImpl(__ubuf__ T *address, T val)
{
    T previous = AtomicXorImpl_<T, __ubuf__ T>(address, val);
    return previous;
}

/*! \brief Bitwise XOR on a __gm__ address; delegates to AtomicXorImpl_. */
template <typename T>
__aicore__ inline T AtomicXorImpl(__gm__ T *address, T val)
{
    T previous = AtomicXorImpl_<T, __gm__ T>(address, val);
    return previous;
}
#endif

}  // namespace Simt
}  // namespace AscendC
#endif  // ASCENDC_MODULE_SIMT_ATOMIC_IMPL_H
